import pandas as pd
# Load the Instagram post metrics.
# The path is a raw string so the Windows backslashes are taken literally:
# sequences such as "\D" and "\I" are invalid escapes in a normal string
# literal and trigger a warning in current Python versions.
# latin1 is used because the caption text is not guaranteed to be valid UTF-8.
data = pd.read_csv(
    r"D:\DataAnalysisProjects-master\Instagram Reach Analysis Using Python\Instagram data.csv",
    encoding="latin1",
)
data
| | Impressions | From Home | From Hashtags | From Explore | From Other | Saves | Comments | Shares | Likes | Profile Visits | Follows | Caption | Hashtags |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3920 | 2586 | 1028 | 619 | 56 | 98 | 9 | 5 | 162 | 35 | 2 | Here are some of the most important data visua... | #finance #money #business #investing #investme... |
| 1 | 5394 | 2727 | 1838 | 1174 | 78 | 194 | 7 | 14 | 224 | 48 | 10 | Here are some of the best data science project... | #healthcare #health #covid #data #datascience ... |
| 2 | 4021 | 2085 | 1188 | 0 | 533 | 41 | 11 | 1 | 131 | 62 | 12 | Learn how to train a machine learning model an... | #data #datascience #dataanalysis #dataanalytic... |
| 3 | 4528 | 2700 | 621 | 932 | 73 | 172 | 10 | 7 | 213 | 23 | 8 | Heres how you can write a Python program to d... | #python #pythonprogramming #pythonprojects #py... |
| 4 | 2518 | 1704 | 255 | 279 | 37 | 96 | 5 | 4 | 123 | 8 | 0 | Plotting annotations while visualizing your da... | #datavisualization #datascience #data #dataana... |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 114 | 13700 | 5185 | 3041 | 5352 | 77 | 573 | 2 | 38 | 373 | 73 | 80 | Here are some of the best data science certifi... | #datascience #datasciencejobs #datasciencetrai... |
| 115 | 5731 | 1923 | 1368 | 2266 | 65 | 135 | 4 | 1 | 148 | 20 | 18 | Clustering is a machine learning technique use... | #machinelearning #machinelearningalgorithms #d... |
| 116 | 4139 | 1133 | 1538 | 1367 | 33 | 36 | 0 | 1 | 92 | 34 | 10 | Clustering music genres is a task of grouping ... | #machinelearning #machinelearningalgorithms #d... |
| 117 | 32695 | 11815 | 3147 | 17414 | 170 | 1095 | 2 | 75 | 549 | 148 | 214 | Here are some of the best data science certifi... | #datascience #datasciencejobs #datasciencetrai... |
| 118 | 36919 | 13473 | 4176 | 16444 | 2547 | 653 | 5 | 26 | 443 | 611 | 228 | 175 Python Projects with Source Code solved an... | #python #pythonprogramming #pythonprojects #py... |
119 rows × 13 columns
# Count the missing entries in every column (isna is the same check as isnull).
data.isna().sum()
Impressions 0 From Home 0 From Hashtags 0 From Explore 0 From Other 0 Saves 0 Comments 0 Shares 0 Likes 0 Profile Visits 0 Follows 0 Caption 0 Hashtags 0 dtype: int64
# Discard every row that contains at least one missing value, then print a
# per-column summary (non-null counts and dtypes) of what is left.
data = data.dropna(axis=0, how="any")
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 119 entries, 0 to 118 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Impressions 119 non-null int64 1 From Home 119 non-null int64 2 From Hashtags 119 non-null int64 3 From Explore 119 non-null int64 4 From Other 119 non-null int64 5 Saves 119 non-null int64 6 Comments 119 non-null int64 7 Shares 119 non-null int64 8 Likes 119 non-null int64 9 Profile Visits 119 non-null int64 10 Follows 119 non-null int64 11 Caption 119 non-null object 12 Hashtags 119 non-null object dtypes: int64(11), object(2) memory usage: 12.2+ KB
Analyzing the reach based on the distribution of impressions received from Home
import matplotlib.pyplot as plt
import seaborn as sns
# Distribution of the impressions that came from the home feed.
# The style is applied before the figure is created so that the figure itself
# (not only the axes drawn afterwards) picks up the theme.
plt.style.use('fivethirtyeight')
plt.figure(figsize=(10, 10))
plt.title("Distribution of Impression from Home")
# sns.distplot is deprecated and scheduled for removal in seaborn v0.14.0.
# histplot with a KDE overlay on a density scale is its axes-level
# replacement; cut=3 keeps the KDE tails as wide as distplot drew them.
sns.histplot(data['From Home'], kde=True, stat="density", kde_kws={"cut": 3})
plt.show()
C:\Users\Deep Gayen\AppData\Local\Temp\ipykernel_19260\120578938.py:4: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(data['From Home'])
Analyzing the reach based on the distribution of impressions received from Hashtags
# Distribution of the impressions that came from hashtags.
plt.figure(figsize=(10, 8))
plt.title("Distribution Of Impressions From Hashtags")
# sns.distplot is deprecated and scheduled for removal in seaborn v0.14.0.
# histplot with a KDE overlay on a density scale is its axes-level
# replacement; cut=3 keeps the KDE tails as wide as distplot drew them.
sns.histplot(data['From Hashtags'], kde=True, stat="density", kde_kws={"cut": 3})
plt.show()
C:\Users\Deep Gayen\AppData\Local\Temp\ipykernel_19260\4174687964.py:3: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(data['From Hashtags'])
Analyzing the reach based on the distribution of impressions received from Explore
# Distribution of the impressions that came from the Explore page.
plt.figure(figsize=(10, 8))
plt.title("Distribution of Impression From Explore")
# sns.distplot is deprecated and scheduled for removal in seaborn v0.14.0.
# histplot with a KDE overlay on a density scale is its axes-level
# replacement; cut=3 keeps the KDE tails as wide as distplot drew them.
sns.histplot(data['From Explore'], kde=True, stat="density", kde_kws={"cut": 3})
plt.show()
C:\Users\Deep Gayen\AppData\Local\Temp\ipykernel_19260\3473073779.py:3: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(data['From Explore'])
Percentage of impressions from the different sources on Instagram, depicted using a pie chart
import plotly.express as px
# Total impressions per traffic source, summed over all posts. The slice
# labels are the source column names themselves, so the same list drives both
# the column lookup and the chart legend.
labels = ['From Home', 'From Hashtags', 'From Explore', 'From Other']
home, hashtags, explore, other = (data[source].sum() for source in labels)
values = [home, hashtags, explore, other]

# Donut chart (hole=0.5) showing each source's share of the impressions.
fig = px.pie(
    data,
    values=values,
    names=labels,
    title='Impression on Instagram Post From Various Sources',
    hole=0.5,
)
fig.show()
Relationship between the number of likes, comments, shares, and post saves and the number of impressions on an Instagram post, shown using scatter plots
# One scatter plot per engagement metric against Impressions. Each entry is
# (column plotted on the y-axis and used for marker size, chart title).
engagement_charts = [
    ("Comments", "Relationship Between Comments and Impressions"),
    ("Likes", "Relationship Between Likes and Impressions"),
    ("Shares", "Relationship Between Shares and Impressions"),
    ("Saves", "Relationship Between Post Saves and Impressions"),
]
for metric, chart_title in engagement_charts:
    # trendline="ols" overlays an ordinary-least-squares fit on the points.
    figure = px.scatter(
        data_frame=data,
        x="Impressions",
        y=metric,
        size=metric,
        trendline="ols",
        title=chart_title,
    )
    figure.show()
Correlation of all the columns with the Impressions column
# Pairwise correlation between the numeric columns only. Caption and Hashtags
# are text (object dtype), and relying on the implicit numeric_only default is
# deprecated, so the restriction is stated explicitly.
correlation = data.corr(numeric_only=True)
# Rank every metric by how strongly it correlates with Impressions.
print(correlation["Impressions"].sort_values(ascending=False))
Impressions 1.000000 From Explore 0.893607 Follows 0.889363 Likes 0.849835 From Home 0.844698 Saves 0.779231 Profile Visits 0.760981 Shares 0.634675 From Other 0.592960 From Hashtags 0.560760 Comments -0.028524 Name: Impressions, dtype: float64
C:\Users\Deep Gayen\AppData\Local\Temp\ipykernel_19260\3935629544.py:1: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
The formula that you can use to calculate the conversion rate is (Follows / Profile Visits) * 100. Now let’s have a look at the conversion rate of the Instagram account
# Conversion rate: the percentage of all profile visits that ended in a follow,
# computed from the totals over every post.
total_follows = data["Follows"].sum()
total_profile_visits = data["Profile Visits"].sum()
conversion_rate = (total_follows / total_profile_visits) * 100
print(conversion_rate)
41.00265604249668
Relationship between the total profile visits and the number of followers gained from all profile visits
# Scatter plot of follows gained against profile visits, with marker size
# scaled by follows and an ordinary-least-squares trendline overlaid.
figure = px.scatter(
    data,
    x="Profile Visits",
    y="Follows",
    size="Follows",
    trendline="ols",
    title="Relationship Between Profile Visits and Followers Gained",
)
figure.show()
Training a machine learning model to predict the reach of an Instagram post
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PassiveAggressiveRegressor
# Engagement metrics used as model inputs; the column order here defines the
# feature order of every row in x.
feature_columns = ['Likes', 'Saves', 'Comments', 'Shares',
                   'Profile Visits', 'Follows']
feature_frame = data[feature_columns]
target_series = data["Impressions"]

x = np.array(feature_frame)
y = np.array(target_series)

# Hold out 20% of the posts for evaluation; the fixed seed keeps the split
# the same on every run.
xtrain, xtest, ytrain, ytest = train_test_split(
    x, y, test_size=0.2, random_state=42
)
Predict the reach of an Instagram post using Python
# Fit a passive-aggressive regressor on the training split.
# The estimator is seeded so that repeated runs give the same fitted model and
# the same score; the train/test split is already seeded, and without this
# the reported score changes from run to run.
model = PassiveAggressiveRegressor(random_state=42)
model.fit(xtrain, ytrain)
# Score the fitted model on the held-out posts.
model.score(xtest, ytest)
0.6644712240888775
Now let’s predict the reach of an Instagram post by giving inputs to the machine learning model
# A single hypothetical post to predict impressions for. The values must be
# given in the same order as the training columns:
# Likes, Saves, Comments, Shares, Profile Visits, Follows
sample_post = [282.0, 233.0, 4.0, 9.0, 165.0, 54.0]
features = np.array([sample_post])  # shape (1, 6): one row, six features
model.predict(features)
array([8302.40022757])